{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "data = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False);\n",
    "data_text = data[['headline_text']]\n",
    "data_text['index'] = data_text.index\n",
    "documents = data_text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1048575"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(documents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>headline_text</th>\n",
       "      <th>index</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>aba decides against community broadcasting lic...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>act fire witnesses must be aware of defamation</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>a g calls for infrastructure protection summit</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>air nz staff in aust strike for pay rise</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>air nz strike to affect australian travellers</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       headline_text  index\n",
       "0  aba decides against community broadcasting lic...      0\n",
       "1     act fire witnesses must be aware of defamation      1\n",
       "2     a g calls for infrastructure protection summit      2\n",
       "3           air nz staff in aust strike for pay rise      3\n",
       "4      air nz strike to affect australian travellers      4"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "documents[:5]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Data Preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gensim\n",
    "from gensim.utils import simple_preprocess\n",
    "from gensim.parsing.preprocessing import STOPWORDS\n",
    "from nltk.stem import WordNetLemmatizer, SnowballStemmer\n",
    "from nltk.stem.porter import *\n",
    "import numpy as np\n",
    "np.random.seed(2018)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package wordnet to\n",
      "[nltk_data]     C:\\Users\\SusanLi\\AppData\\Roaming\\nltk_data...\n",
      "[nltk_data]   Package wordnet is already up-to-date!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import nltk\n",
    "nltk.download('wordnet')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Lemmatize example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "go\n"
     ]
    }
   ],
   "source": [
    "print(WordNetLemmatizer().lemmatize('went', pos='v'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Stemmer Example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>original word</th>\n",
       "      <th>stemmed</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>caresses</td>\n",
       "      <td>caress</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>flies</td>\n",
       "      <td>fli</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>dies</td>\n",
       "      <td>die</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>mules</td>\n",
       "      <td>mule</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>denied</td>\n",
       "      <td>deni</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>died</td>\n",
       "      <td>die</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>agreed</td>\n",
       "      <td>agre</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>owned</td>\n",
       "      <td>own</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>humbled</td>\n",
       "      <td>humbl</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>sized</td>\n",
       "      <td>size</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>meeting</td>\n",
       "      <td>meet</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>stating</td>\n",
       "      <td>state</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>siezing</td>\n",
       "      <td>siez</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>itemization</td>\n",
       "      <td>item</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>sensational</td>\n",
       "      <td>sensat</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>traditional</td>\n",
       "      <td>tradit</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>reference</td>\n",
       "      <td>refer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>colonizer</td>\n",
       "      <td>colon</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>plotted</td>\n",
       "      <td>plot</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   original word stemmed\n",
       "0       caresses  caress\n",
       "1          flies     fli\n",
       "2           dies     die\n",
       "3          mules    mule\n",
       "4         denied    deni\n",
       "5           died     die\n",
       "6         agreed    agre\n",
       "7          owned     own\n",
       "8        humbled   humbl\n",
       "9          sized    size\n",
       "10       meeting    meet\n",
       "11       stating   state\n",
       "12       siezing    siez\n",
       "13   itemization    item\n",
       "14   sensational  sensat\n",
       "15   traditional  tradit\n",
       "16     reference   refer\n",
       "17     colonizer   colon\n",
       "18       plotted    plot"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stemmer = SnowballStemmer('english')\n",
    "original_words = ['caresses', 'flies', 'dies', 'mules', 'denied','died', 'agreed', 'owned', \n",
    "           'humbled', 'sized','meeting', 'stating', 'siezing', 'itemization','sensational', \n",
    "           'traditional', 'reference', 'colonizer','plotted']\n",
    "singles = [stemmer.stem(plural) for plural in original_words]\n",
    "pd.DataFrame(data = {'original word': original_words, 'stemmed': singles})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def lemmatize_stemming(text):\n",
    "    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))\n",
    "\n",
    "def preprocess(text):\n",
    "    result = []\n",
    "    for token in gensim.utils.simple_preprocess(text):\n",
    "        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:\n",
    "            result.append(lemmatize_stemming(token))\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "original document: \n",
      "['rain', 'helps', 'dampen', 'bushfires']\n",
      "\n",
      "\n",
      " tokenized and lemmatized document: \n",
      "['rain', 'help', 'dampen', 'bushfir']\n"
     ]
    }
   ],
   "source": [
    "doc_sample = documents[documents['index'] == 4310].values[0][0]\n",
    "\n",
    "print('original document: ')\n",
    "words = []\n",
    "for word in doc_sample.split(' '):\n",
    "    words.append(word)\n",
    "print(words)\n",
    "print('\\n\\n tokenized and lemmatized document: ')\n",
    "print(preprocess(doc_sample))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "processed_docs = documents['headline_text'].map(preprocess)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0            [decid, communiti, broadcast, licenc]\n",
       "1                               [wit, awar, defam]\n",
       "2           [call, infrastructur, protect, summit]\n",
       "3                      [staff, aust, strike, rise]\n",
       "4             [strike, affect, australian, travel]\n",
       "5               [ambiti, olsson, win, tripl, jump]\n",
       "6           [antic, delight, record, break, barca]\n",
       "7    [aussi, qualifi, stosur, wast, memphi, match]\n",
       "8            [aust, address, secur, council, iraq]\n",
       "9                         [australia, lock, timet]\n",
       "Name: headline_text, dtype: object"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "processed_docs[:10]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Bag of words on the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "dictionary = gensim.corpora.Dictionary(processed_docs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 broadcast\n",
      "1 communiti\n",
      "2 decid\n",
      "3 licenc\n",
      "4 awar\n",
      "5 defam\n",
      "6 wit\n",
      "7 call\n",
      "8 infrastructur\n",
      "9 protect\n",
      "10 summit\n"
     ]
    }
   ],
   "source": [
    "count = 0\n",
    "for k, v in dictionary.iteritems():\n",
    "    print(k, v)\n",
    "    count += 1\n",
    "    if count > 10:\n",
    "        break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(76, 1), (112, 1), (483, 1), (3998, 1)]"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]\n",
    "bow_corpus[4310]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Word 76 (\"bushfir\") appears 1 time.\n",
      "Word 112 (\"help\") appears 1 time.\n",
      "Word 483 (\"rain\") appears 1 time.\n",
      "Word 3998 (\"dampen\") appears 1 time.\n"
     ]
    }
   ],
   "source": [
    "bow_doc_4310 = bow_corpus[4310]\n",
    "\n",
    "for i in range(len(bow_doc_4310)):\n",
    "    print(\"Word {} (\\\"{}\\\") appears {} time.\".format(bow_doc_4310[i][0], \n",
    "                                                     dictionary[bow_doc_4310[i][0]], \n",
    "                                                     bow_doc_4310[i][1]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### TF-IDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "from gensim import corpora, models\n",
    "\n",
    "tfidf = models.TfidfModel(bow_corpus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "corpus_tfidf = tfidf[bow_corpus]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(0, 0.5907943557842693),\n",
      " (1, 0.3900924708457926),\n",
      " (2, 0.49514546614015836),\n",
      " (3, 0.5036078441840635)]\n"
     ]
    }
   ],
   "source": [
    "from pprint import pprint\n",
    "\n",
    "for doc in corpus_tfidf:\n",
    "    pprint(doc)\n",
    "    break"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Running LDA using Bag of Words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Topic: 0 \n",
      "Words: 0.035*\"govern\" + 0.024*\"open\" + 0.018*\"coast\" + 0.017*\"tasmanian\" + 0.017*\"gold\" + 0.014*\"australia\" + 0.013*\"beat\" + 0.010*\"win\" + 0.010*\"ahead\" + 0.009*\"shark\"\n",
      "Topic: 1 \n",
      "Words: 0.023*\"world\" + 0.014*\"final\" + 0.013*\"record\" + 0.012*\"break\" + 0.011*\"lose\" + 0.011*\"australian\" + 0.011*\"leagu\" + 0.011*\"test\" + 0.010*\"australia\" + 0.010*\"hill\"\n",
      "Topic: 2 \n",
      "Words: 0.018*\"rural\" + 0.018*\"council\" + 0.015*\"fund\" + 0.014*\"plan\" + 0.013*\"health\" + 0.012*\"chang\" + 0.011*\"nation\" + 0.010*\"price\" + 0.010*\"servic\" + 0.009*\"say\"\n",
      "Topic: 3 \n",
      "Words: 0.025*\"elect\" + 0.022*\"adelaid\" + 0.012*\"perth\" + 0.011*\"take\" + 0.011*\"say\" + 0.010*\"labor\" + 0.010*\"turnbul\" + 0.009*\"vote\" + 0.009*\"royal\" + 0.009*\"time\"\n",
      "Topic: 4 \n",
      "Words: 0.032*\"court\" + 0.022*\"face\" + 0.020*\"charg\" + 0.020*\"home\" + 0.018*\"tasmania\" + 0.017*\"murder\" + 0.015*\"trial\" + 0.012*\"accus\" + 0.012*\"abus\" + 0.012*\"child\"\n",
      "Topic: 5 \n",
      "Words: 0.024*\"countri\" + 0.021*\"hour\" + 0.020*\"australian\" + 0.019*\"warn\" + 0.016*\"live\" + 0.013*\"indigen\" + 0.011*\"call\" + 0.009*\"victorian\" + 0.009*\"campaign\" + 0.008*\"show\"\n",
      "Topic: 6 \n",
      "Words: 0.027*\"south\" + 0.024*\"year\" + 0.020*\"interview\" + 0.020*\"north\" + 0.019*\"jail\" + 0.018*\"west\" + 0.014*\"island\" + 0.013*\"australia\" + 0.013*\"victoria\" + 0.010*\"china\"\n",
      "Topic: 7 \n",
      "Words: 0.031*\"queensland\" + 0.029*\"melbourn\" + 0.018*\"water\" + 0.017*\"claim\" + 0.013*\"hunter\" + 0.012*\"green\" + 0.012*\"resid\" + 0.011*\"darwin\" + 0.010*\"young\" + 0.009*\"plead\"\n",
      "Topic: 8 \n",
      "Words: 0.017*\"attack\" + 0.016*\"kill\" + 0.012*\"victim\" + 0.012*\"violenc\" + 0.010*\"hobart\" + 0.010*\"rugbi\" + 0.010*\"secur\" + 0.010*\"say\" + 0.009*\"state\" + 0.008*\"domest\"\n",
      "Topic: 9 \n",
      "Words: 0.052*\"polic\" + 0.020*\"crash\" + 0.019*\"death\" + 0.017*\"sydney\" + 0.016*\"miss\" + 0.016*\"woman\" + 0.015*\"die\" + 0.015*\"charg\" + 0.014*\"shoot\" + 0.013*\"arrest\"\n"
     ]
    }
   ],
   "source": [
    "for idx, topic in lda_model.print_topics(-1):\n",
    "    print('Topic: {} \\nWords: {}'.format(idx, topic))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Cool! Can you distinguish different topics using the words in each topic and their corresponding weights?"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Running LDA using TF-IDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Topic: 0 Word: 0.008*\"octob\" + 0.006*\"search\" + 0.006*\"miss\" + 0.006*\"inquest\" + 0.005*\"stori\" + 0.005*\"jam\" + 0.004*\"john\" + 0.004*\"harvest\" + 0.004*\"australia\" + 0.004*\"world\"\n",
      "Topic: 1 Word: 0.006*\"action\" + 0.006*\"violenc\" + 0.006*\"thursday\" + 0.005*\"domest\" + 0.005*\"cancer\" + 0.005*\"legal\" + 0.005*\"union\" + 0.005*\"breakfast\" + 0.005*\"school\" + 0.004*\"student\"\n",
      "Topic: 2 Word: 0.023*\"rural\" + 0.018*\"govern\" + 0.013*\"news\" + 0.012*\"podcast\" + 0.008*\"grandstand\" + 0.008*\"health\" + 0.007*\"budget\" + 0.007*\"busi\" + 0.007*\"nation\" + 0.007*\"fund\"\n",
      "Topic: 3 Word: 0.030*\"countri\" + 0.028*\"hour\" + 0.009*\"sport\" + 0.008*\"septemb\" + 0.008*\"wednesday\" + 0.007*\"commiss\" + 0.006*\"royal\" + 0.006*\"updat\" + 0.006*\"station\" + 0.005*\"bendigo\"\n",
      "Topic: 4 Word: 0.014*\"south\" + 0.009*\"weather\" + 0.009*\"north\" + 0.008*\"west\" + 0.008*\"coast\" + 0.008*\"australia\" + 0.006*\"east\" + 0.006*\"queensland\" + 0.006*\"storm\" + 0.005*\"season\"\n",
      "Topic: 5 Word: 0.008*\"monday\" + 0.008*\"august\" + 0.006*\"babi\" + 0.005*\"shorten\" + 0.005*\"hobart\" + 0.004*\"victorian\" + 0.004*\"donald\" + 0.004*\"safe\" + 0.004*\"scott\" + 0.004*\"donat\"\n",
      "Topic: 6 Word: 0.022*\"interview\" + 0.013*\"market\" + 0.009*\"share\" + 0.008*\"cattl\" + 0.008*\"trump\" + 0.008*\"turnbul\" + 0.007*\"novemb\" + 0.007*\"michael\" + 0.006*\"australian\" + 0.006*\"export\"\n",
      "Topic: 7 Word: 0.019*\"crash\" + 0.014*\"kill\" + 0.009*\"fatal\" + 0.009*\"dead\" + 0.007*\"die\" + 0.007*\"truck\" + 0.007*\"polic\" + 0.006*\"attack\" + 0.006*\"injur\" + 0.006*\"bomb\"\n",
      "Topic: 8 Word: 0.008*\"drum\" + 0.007*\"abbott\" + 0.007*\"farm\" + 0.006*\"dairi\" + 0.006*\"asylum\" + 0.006*\"tuesday\" + 0.006*\"water\" + 0.006*\"labor\" + 0.006*\"say\" + 0.005*\"plan\"\n",
      "Topic: 9 Word: 0.017*\"charg\" + 0.014*\"murder\" + 0.011*\"court\" + 0.011*\"polic\" + 0.009*\"woman\" + 0.008*\"assault\" + 0.008*\"jail\" + 0.008*\"alleg\" + 0.007*\"accus\" + 0.007*\"guilti\"\n"
     ]
    }
   ],
   "source": [
    "for idx, topic in lda_model_tfidf.print_topics(-1):\n",
    "    print('Topic: {} Word: {}'.format(idx, topic))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Classification of the topics"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Performance evaluation by classifying sample document using LDA Bag of Words model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['rain', 'help', 'dampen', 'bushfir']"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "processed_docs[4310]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Score: 0.41997694969177246\t \n",
      "Topic: 0.017*\"attack\" + 0.016*\"kill\" + 0.012*\"victim\" + 0.012*\"violenc\" + 0.010*\"hobart\" + 0.010*\"rugbi\" + 0.010*\"secur\" + 0.010*\"say\" + 0.009*\"state\" + 0.008*\"domest\"\n",
      "\n",
      "Score: 0.21999986469745636\t \n",
      "Topic: 0.023*\"world\" + 0.014*\"final\" + 0.013*\"record\" + 0.012*\"break\" + 0.011*\"lose\" + 0.011*\"australian\" + 0.011*\"leagu\" + 0.011*\"test\" + 0.010*\"australia\" + 0.010*\"hill\"\n",
      "\n",
      "Score: 0.21999594569206238\t \n",
      "Topic: 0.027*\"south\" + 0.024*\"year\" + 0.020*\"interview\" + 0.020*\"north\" + 0.019*\"jail\" + 0.018*\"west\" + 0.014*\"island\" + 0.013*\"australia\" + 0.013*\"victoria\" + 0.010*\"china\"\n",
      "\n",
      "Score: 0.020009687170386314\t \n",
      "Topic: 0.018*\"rural\" + 0.018*\"council\" + 0.015*\"fund\" + 0.014*\"plan\" + 0.013*\"health\" + 0.012*\"chang\" + 0.011*\"nation\" + 0.010*\"price\" + 0.010*\"servic\" + 0.009*\"say\"\n",
      "\n",
      "Score: 0.020008400082588196\t \n",
      "Topic: 0.024*\"countri\" + 0.021*\"hour\" + 0.020*\"australian\" + 0.019*\"warn\" + 0.016*\"live\" + 0.013*\"indigen\" + 0.011*\"call\" + 0.009*\"victorian\" + 0.009*\"campaign\" + 0.008*\"show\"\n",
      "\n",
      "Score: 0.02000494673848152\t \n",
      "Topic: 0.031*\"queensland\" + 0.029*\"melbourn\" + 0.018*\"water\" + 0.017*\"claim\" + 0.013*\"hunter\" + 0.012*\"green\" + 0.012*\"resid\" + 0.011*\"darwin\" + 0.010*\"young\" + 0.009*\"plead\"\n",
      "\n",
      "Score: 0.020004209131002426\t \n",
      "Topic: 0.052*\"polic\" + 0.020*\"crash\" + 0.019*\"death\" + 0.017*\"sydney\" + 0.016*\"miss\" + 0.016*\"woman\" + 0.015*\"die\" + 0.015*\"charg\" + 0.014*\"shoot\" + 0.013*\"arrest\"\n",
      "\n",
      "Score: 0.019999999552965164\t \n",
      "Topic: 0.035*\"govern\" + 0.024*\"open\" + 0.018*\"coast\" + 0.017*\"tasmanian\" + 0.017*\"gold\" + 0.014*\"australia\" + 0.013*\"beat\" + 0.010*\"win\" + 0.010*\"ahead\" + 0.009*\"shark\"\n",
      "\n",
      "Score: 0.019999999552965164\t \n",
      "Topic: 0.025*\"elect\" + 0.022*\"adelaid\" + 0.012*\"perth\" + 0.011*\"take\" + 0.011*\"say\" + 0.010*\"labor\" + 0.010*\"turnbul\" + 0.009*\"vote\" + 0.009*\"royal\" + 0.009*\"time\"\n",
      "\n",
      "Score: 0.019999999552965164\t \n",
      "Topic: 0.032*\"court\" + 0.022*\"face\" + 0.020*\"charg\" + 0.020*\"home\" + 0.018*\"tasmania\" + 0.017*\"murder\" + 0.015*\"trial\" + 0.012*\"accus\" + 0.012*\"abus\" + 0.012*\"child\"\n"
     ]
    }
   ],
   "source": [
    "for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):\n",
    "    print(\"\\nScore: {}\\t \\nTopic: {}\".format(score, lda_model.print_topic(index, 10)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Our test document has the highest probability to be part of the topic on the top."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Performance evaluation by classifying sample document using LDA TF-IDF model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Score: 0.44014573097229004\t \n",
      "Topic: 0.014*\"south\" + 0.009*\"weather\" + 0.009*\"north\" + 0.008*\"west\" + 0.008*\"coast\" + 0.008*\"australia\" + 0.006*\"east\" + 0.006*\"queensland\" + 0.006*\"storm\" + 0.005*\"season\"\n",
      "\n",
      "Score: 0.3998423218727112\t \n",
      "Topic: 0.023*\"rural\" + 0.018*\"govern\" + 0.013*\"news\" + 0.012*\"podcast\" + 0.008*\"grandstand\" + 0.008*\"health\" + 0.007*\"budget\" + 0.007*\"busi\" + 0.007*\"nation\" + 0.007*\"fund\"\n",
      "\n",
      "Score: 0.02000250481069088\t \n",
      "Topic: 0.006*\"action\" + 0.006*\"violenc\" + 0.006*\"thursday\" + 0.005*\"domest\" + 0.005*\"cancer\" + 0.005*\"legal\" + 0.005*\"union\" + 0.005*\"breakfast\" + 0.005*\"school\" + 0.004*\"student\"\n",
      "\n",
      "Score: 0.020002111792564392\t \n",
      "Topic: 0.008*\"drum\" + 0.007*\"abbott\" + 0.007*\"farm\" + 0.006*\"dairi\" + 0.006*\"asylum\" + 0.006*\"tuesday\" + 0.006*\"water\" + 0.006*\"labor\" + 0.006*\"say\" + 0.005*\"plan\"\n",
      "\n",
      "Score: 0.020001791417598724\t \n",
      "Topic: 0.030*\"countri\" + 0.028*\"hour\" + 0.009*\"sport\" + 0.008*\"septemb\" + 0.008*\"wednesday\" + 0.007*\"commiss\" + 0.006*\"royal\" + 0.006*\"updat\" + 0.006*\"station\" + 0.005*\"bendigo\"\n",
      "\n",
      "Score: 0.02000163123011589\t \n",
      "Topic: 0.008*\"octob\" + 0.006*\"search\" + 0.006*\"miss\" + 0.006*\"inquest\" + 0.005*\"stori\" + 0.005*\"jam\" + 0.004*\"john\" + 0.004*\"harvest\" + 0.004*\"australia\" + 0.004*\"world\"\n",
      "\n",
      "Score: 0.020001478493213654\t \n",
      "Topic: 0.008*\"monday\" + 0.008*\"august\" + 0.006*\"babi\" + 0.005*\"shorten\" + 0.005*\"hobart\" + 0.004*\"victorian\" + 0.004*\"donald\" + 0.004*\"safe\" + 0.004*\"scott\" + 0.004*\"donat\"\n",
      "\n",
      "Score: 0.02000102959573269\t \n",
      "Topic: 0.017*\"charg\" + 0.014*\"murder\" + 0.011*\"court\" + 0.011*\"polic\" + 0.009*\"woman\" + 0.008*\"assault\" + 0.008*\"jail\" + 0.008*\"alleg\" + 0.007*\"accus\" + 0.007*\"guilti\"\n",
      "\n",
      "Score: 0.020000804215669632\t \n",
      "Topic: 0.022*\"interview\" + 0.013*\"market\" + 0.009*\"share\" + 0.008*\"cattl\" + 0.008*\"trump\" + 0.008*\"turnbul\" + 0.007*\"novemb\" + 0.007*\"michael\" + 0.006*\"australian\" + 0.006*\"export\"\n",
      "\n",
      "Score: 0.020000625401735306\t \n",
      "Topic: 0.019*\"crash\" + 0.014*\"kill\" + 0.009*\"fatal\" + 0.009*\"dead\" + 0.007*\"die\" + 0.007*\"truck\" + 0.007*\"polic\" + 0.006*\"attack\" + 0.006*\"injur\" + 0.006*\"bomb\"\n"
     ]
    }
   ],
   "source": [
    "for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):\n",
    "    print(\"\\nScore: {}\\t \\nTopic: {}\".format(score, lda_model_tfidf.print_topic(index, 10)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Our test document has the highest probability to be part of the topic on the top."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Testing model on unseen document"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Score: 0.3500000238418579\t Topic: 0.017*\"attack\" + 0.016*\"kill\" + 0.012*\"victim\" + 0.012*\"violenc\" + 0.010*\"hobart\"\n",
      "Score: 0.34998345375061035\t Topic: 0.025*\"elect\" + 0.022*\"adelaid\" + 0.012*\"perth\" + 0.011*\"take\" + 0.011*\"say\"\n",
      "Score: 0.18333327770233154\t Topic: 0.052*\"polic\" + 0.020*\"crash\" + 0.019*\"death\" + 0.017*\"sydney\" + 0.016*\"miss\"\n",
      "Score: 0.01667369343340397\t Topic: 0.035*\"govern\" + 0.024*\"open\" + 0.018*\"coast\" + 0.017*\"tasmanian\" + 0.017*\"gold\"\n",
      "Score: 0.01666990853846073\t Topic: 0.027*\"south\" + 0.024*\"year\" + 0.020*\"interview\" + 0.020*\"north\" + 0.019*\"jail\"\n",
      "Score: 0.016669215634465218\t Topic: 0.018*\"rural\" + 0.018*\"council\" + 0.015*\"fund\" + 0.014*\"plan\" + 0.013*\"health\"\n",
      "Score: 0.016668815165758133\t Topic: 0.024*\"countri\" + 0.021*\"hour\" + 0.020*\"australian\" + 0.019*\"warn\" + 0.016*\"live\"\n",
      "Score: 0.01666823774576187\t Topic: 0.031*\"queensland\" + 0.029*\"melbourn\" + 0.018*\"water\" + 0.017*\"claim\" + 0.013*\"hunter\"\n",
      "Score: 0.016666674986481667\t Topic: 0.032*\"court\" + 0.022*\"face\" + 0.020*\"charg\" + 0.020*\"home\" + 0.018*\"tasmania\"\n",
      "Score: 0.01666666753590107\t Topic: 0.023*\"world\" + 0.014*\"final\" + 0.013*\"record\" + 0.012*\"break\" + 0.011*\"lose\"\n"
     ]
    }
   ],
   "source": [
    "unseen_document = 'How a Pentagon deal became an identity crisis for Google'\n",
    "bow_vector = dictionary.doc2bow(preprocess(unseen_document))\n",
    "\n",
    "for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):\n",
    "    print(\"Score: {}\\t Topic: {}\".format(score, lda_model.print_topic(index, 5)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
